From ad5d9c688dc5da19cfec81f6cdedd7724bb048c8 Mon Sep 17 00:00:00 2001 From: "kaf24@scramble.cl.cam.ac.uk" Date: Tue, 25 Nov 2003 15:36:17 +0000 Subject: [PATCH] bitkeeper revision 1.636 (3fc376f1OnDIvL1xiIhqzjKDbchujQ) desc.h, traps.c, sched.h, event.h, schedule.c, memory.c: Fixes and cleanups. --- xen/common/memory.c | 3 +- xen/common/schedule.c | 226 ++++++++---------- xen/include/xeno/event.h | 38 +-- xen/include/xeno/sched.h | 6 +- .../arch/xeno/kernel/traps.c | 18 ++ .../include/asm-xeno/desc.h | 14 +- 6 files changed, 153 insertions(+), 152 deletions(-) diff --git a/xen/common/memory.c b/xen/common/memory.c index 77fe5822a2..8cbb503cf3 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -821,8 +821,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val) else if ( (current->mm.ldt_ents != ents) || (current->mm.ldt_base != ptr) ) { - if ( current->mm.ldt_ents != 0 ) - invalidate_shadow_ldt(); + invalidate_shadow_ldt(); current->mm.ldt_base = ptr; current->mm.ldt_ents = ents; load_LDT(current); diff --git a/xen/common/schedule.c b/xen/common/schedule.c index 5997b276a2..2b834d93e3 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -43,7 +43,6 @@ static s32 ctx_allow = (s32)MILLISECS(5); /* context switch allowance */ typedef struct schedule_data_st { - spinlock_t lock; /* lock for protecting this */ struct list_head runqueue; /* runqueue */ struct task_struct *curr; /* current task */ struct task_struct *idle; /* idle task for this cpu */ @@ -55,6 +54,8 @@ typedef struct schedule_data_st } __cacheline_aligned schedule_data_t; static schedule_data_t schedule_data[NR_CPUS]; +spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned; + /* Skanky periodic event to all guests. This must die in the next release! */ static struct ac_timer v_timer; @@ -128,8 +129,7 @@ void sched_add_domain(struct task_struct *p) if ( p->domain == IDLE_DOMAIN_ID ) { - p->avt = 0xffffffff; - p->evt = 0xffffffff; + p->avt = p->evt = ~0U; schedule_data[p->processor].idle = p; } else @@ -159,29 +159,21 @@ void init_idle_task(void) { unsigned long flags; struct task_struct *p = current; - spin_lock_irqsave(&schedule_data[p->processor].lock, flags); + spin_lock_irqsave(&schedule_lock[p->processor], flags); p->has_cpu = 1; p->state = TASK_RUNNING; if ( !__task_on_runqueue(p) ) __add_to_runqueue_head(p); - spin_unlock_irqrestore(&schedule_data[p->processor].lock, flags); + spin_unlock_irqrestore(&schedule_lock[p->processor], flags); } -/* - * wake up a domain which had been sleeping - */ -int wake_up(struct task_struct *p) +void __wake_up(struct task_struct *p) { - unsigned long flags; - int ret = 0; + ASSERT(p->state != TASK_DYING); - spin_lock_irqsave(&schedule_data[p->processor].lock, flags); - - /* XXX RN: should we warp here? Might be a good idea to also boost a - * domain which currently is unwarped and on run queue and - * the receives an event. */ - if ( __task_on_runqueue(p) ) goto out; + if ( unlikely(__task_on_runqueue(p)) ) + return; p->state = TASK_RUNNING; __add_to_runqueue_head(p); @@ -198,16 +190,17 @@ int wake_up(struct task_struct *p) #ifdef SCHED_HISTO p->wokenup = NOW(); #endif +} - ret = 1; - out: - spin_unlock_irqrestore(&schedule_data[p->processor].lock, flags); - return ret; +void wake_up(struct task_struct *p) +{ + unsigned long flags; + spin_lock_irqsave(&schedule_lock[p->processor], flags); + __wake_up(p); + spin_unlock_irqrestore(&schedule_lock[p->processor], flags); } -/* - * Voluntarily yield the processor to another domain, until an event occurs. 
- */ +/* Voluntarily yield the processor to another domain, until an event occurs. */ long do_yield(void) { current->state = TASK_INTERRUPTIBLE; @@ -216,9 +209,7 @@ long do_yield(void) return 0; } -/* - * Demultiplex scheduler-related hypercalls. - */ +/* Demultiplex scheduler-related hypercalls. */ long do_sched_op(unsigned long op) { long ret = 0; @@ -251,18 +242,14 @@ long do_sched_op(unsigned long op) return ret; } -/* - * Control the scheduler - */ +/* Control the scheduler. */ long sched_bvtctl(unsigned long c_allow) { ctx_allow = c_allow; return 0; } -/* - * Adjust scheduling parameter for a given domain - */ +/* Adjust scheduling parameter for a given domain. */ long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp, unsigned long warpl, unsigned long warpu) { @@ -276,9 +263,9 @@ long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp, if ( p == NULL ) return -ESRCH; - spin_lock_irq(&schedule_data[p->processor].lock); + spin_lock_irq(&schedule_lock[p->processor]); p->mcu_advance = mcu_adv; - spin_unlock_irq(&schedule_data[p->processor].lock); + spin_unlock_irq(&schedule_lock[p->processor]); put_task_struct(p); @@ -293,18 +280,15 @@ long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp, * Otherwise we do a run through the scheduler after the current tasks * context switch allowance is over. */ -void reschedule(struct task_struct *p) +unsigned long __reschedule(struct task_struct *p) { int cpu = p->processor; struct task_struct *curr; - unsigned long flags; s_time_t now, min_time; - if ( p->has_cpu ) - return; + if ( unlikely(p->has_cpu || !__task_on_runqueue(p)) ) + return 0; - spin_lock_irqsave(&schedule_data[cpu].lock, flags); - now = NOW(); curr = schedule_data[cpu].curr; /* domain should run at least for ctx_allow */ @@ -312,23 +296,26 @@ void reschedule(struct task_struct *p) if ( is_idle_task(curr) || (min_time <= now) ) { - /* reschedule */ set_bit(_HYP_EVENT_NEED_RESCHED, &curr->hyp_events); - - spin_unlock_irqrestore(&schedule_data[cpu].lock, flags); - - if ( cpu != smp_processor_id() ) - smp_send_event_check_cpu(cpu); - - return; + return (1 << p->processor); } /* current hasn't been running for long enough -> reprogram timer. * but don't bother if timer would go off soon anyway */ if ( schedule_data[cpu].s_timer.expires > min_time + TIME_SLOP ) mod_ac_timer(&schedule_data[cpu].s_timer, min_time); - - spin_unlock_irqrestore(&schedule_data[cpu].lock, flags); + + return 0; +} + + +void reschedule(struct task_struct *p) +{ + unsigned long flags, cpu_mask; + spin_lock_irqsave(&schedule_lock[p->processor], flags); + cpu_mask = __reschedule(p); + spin_unlock_irqrestore(&schedule_lock[p->processor], flags); + hyp_event_notify(cpu_mask); } @@ -341,9 +328,9 @@ void reschedule(struct task_struct *p) */ asmlinkage void __enter_scheduler(void) { - struct task_struct *prev, *next, *next_prime, *p; + struct task_struct *prev = current, *next = NULL, *next_prime, *p; struct list_head *tmp; - int this_cpu; + int this_cpu = prev->processor; s_time_t now; s32 r_time; /* time for new dom to run */ s32 ranfor; /* assume we never run longer than 2.1s! 
*/ @@ -352,69 +339,41 @@ asmlinkage void __enter_scheduler(void) perfc_incrc(sched_run); - prev = current; - next = NULL; - - this_cpu = prev->processor; - - spin_lock_irq(&schedule_data[this_cpu].lock); + spin_lock_irq(&schedule_lock[this_cpu]); now = NOW(); - /* remove timer, if still on list */ rem_ac_timer(&schedule_data[this_cpu].s_timer); - /* deschedule the current domain */ - ASSERT(!in_interrupt()); ASSERT(__task_on_runqueue(prev)); + ASSERT(prev->state != TASK_UNINTERRUPTIBLE); - if ( is_idle_task(prev) ) - goto deschedule_done; - - /* do some accounting */ - ranfor = (s32)(now - prev->lastschd); - prev->cpu_time += ranfor; - - /* calculate mcu and update avt */ - mcus = ranfor/MCU; - if (ranfor % MCU) mcus ++; /* always round up */ - prev->avt += mcus * prev->mcu_advance; - - /* recalculate evt */ - __calc_evt(prev); - - /* dequeue */ - __del_from_runqueue(prev); - - switch ( prev->state ) + if ( likely(!is_idle_task(prev)) ) { - case TASK_INTERRUPTIBLE: - if ( signal_pending(prev) ) + ranfor = (s32)(now - prev->lastschd); + prev->cpu_time += ranfor; + + /* Calculate mcu and update avt. */ + mcus = (ranfor + MCU - 1) / MCU; + prev->avt += mcus * prev->mcu_advance; + + __calc_evt(prev); + + __del_from_runqueue(prev); + + if ( likely(prev->state == TASK_RUNNING) || + unlikely((prev->state == TASK_INTERRUPTIBLE) && + signal_pending(prev)) ) { - prev->state = TASK_RUNNING; /* but has events pending */ - break; + prev->state = TASK_RUNNING; + __add_to_runqueue_tail(prev); } - case TASK_UNINTERRUPTIBLE: - case TASK_DYING: - case TASK_STOPPED: - default: - /* Done if not running. Else continue. */ - goto deschedule_done; - case TASK_RUNNING:; } - /* requeue */ - __add_to_runqueue_tail(prev); - - deschedule_done: clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events); - /* - * Pick a new domain - */ - - /* we should at least have the idle task */ + /* We should at least have the idle task */ ASSERT(!list_empty(&schedule_data[this_cpu].runqueue)); /* @@ -425,64 +384,76 @@ asmlinkage void __enter_scheduler(void) next = schedule_data[this_cpu].idle; next_prime = NULL; - next_evt = 0xffffffff; - next_prime_evt = 0xffffffff; - min_avt = 0xffffffff; /* to calculate svt */ + next_evt = ~0U; + next_prime_evt = ~0U; + min_avt = ~0U; - list_for_each(tmp, &schedule_data[this_cpu].runqueue) { + list_for_each ( tmp, &schedule_data[this_cpu].runqueue ) + { p = list_entry(tmp, struct task_struct, run_list); - if (p->evt < next_evt) { + if ( p->evt < next_evt ) + { next_prime = next; next_prime_evt = next_evt; next = p; next_evt = p->evt; - } else if (next_prime_evt == 0xffffffff) { + } + else if ( next_prime_evt == ~0U ) + { next_prime_evt = p->evt; next_prime = p; - } else if (p->evt < next_prime_evt) { + } + else if ( p->evt < next_prime_evt ) + { next_prime_evt = p->evt; next_prime = p; } - /* determine system virtual time */ - if (p->avt < min_avt) + + /* Determine system virtual time. */ + if ( p->avt < min_avt ) min_avt = p->avt; } - ASSERT(next != NULL); /* we should have at least the idle task */ - /* update system virtual time */ - if (min_avt != 0xffffffff) schedule_data[this_cpu].svt = min_avt; + /* Update system virtual time. 
*/ + if ( min_avt != ~0U ) + schedule_data[this_cpu].svt = min_avt; /* check for virtual time overrun on this cpu */ - if (schedule_data[this_cpu].svt >= 0xf0000000) { + if ( schedule_data[this_cpu].svt >= 0xf0000000 ) + { u_long t_flags; write_lock_irqsave(&tasklist_lock, t_flags); p = &idle0_task; do { - if (p->processor == this_cpu && !is_idle_task(p)) { + if ( (p->processor == this_cpu) && !is_idle_task(p) ) + { p->evt -= 0xe0000000; p->avt -= 0xe0000000; } - } while ( (p = p->next_task) != &idle0_task ); + } + while ( (p = p->next_task) != &idle0_task ); write_unlock_irqrestore(&tasklist_lock, t_flags); schedule_data[this_cpu].svt -= 0xe0000000; } /* work out time for next run through scheduler */ - if (is_idle_task(next)) { + if ( is_idle_task(next) ) + { r_time = ctx_allow; goto sched_done; } - if (next_prime == NULL || is_idle_task(next_prime)) { - /* we have only one runable task besides the idle task */ + if ( (next_prime == NULL) || is_idle_task(next_prime) ) + { + /* We have only one runnable task besides the idle task. */ r_time = 10 * ctx_allow; /* RN: random constant */ goto sched_done; } /* - * if we are here we have two runable tasks. - * work out how long 'next' can run till its evt is greater than - * 'next_prime's evt. Taking context switch allowance into account. + * If we are here then we have two runnable tasks. + * Work out how long 'next' can run till its evt is greater than + * 'next_prime's evt. Take context switch allowance into account. */ ASSERT(next_prime->evt >= next->evt); r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow; @@ -491,7 +462,8 @@ asmlinkage void __enter_scheduler(void) ASSERT(r_time >= ctx_allow); #ifndef NDEBUG - if (r_time < ctx_allow) { + if ( r_time < ctx_allow ) + { printk("[%02d]: %lx\n", this_cpu, (unsigned long)r_time); dump_rqueue(&schedule_data[this_cpu].runqueue, "foo"); } @@ -508,7 +480,7 @@ asmlinkage void __enter_scheduler(void) schedule_data[this_cpu].s_timer.expires = now + r_time; add_ac_timer(&schedule_data[this_cpu].s_timer); - spin_unlock_irq(&schedule_data[this_cpu].lock); + spin_unlock_irq(&schedule_lock[this_cpu]); /* done, switch tasks */ if ( unlikely(prev == next) ) @@ -610,7 +582,7 @@ void __init scheduler_init(void) for ( i = 0; i < NR_CPUS; i++ ) { INIT_LIST_HEAD(&schedule_data[i].runqueue); - spin_lock_init(&schedule_data[i].lock); + spin_lock_init(&schedule_lock[i]); schedule_data[i].curr = &idle0_task; init_ac_timer(&schedule_data[i].s_timer); @@ -688,10 +660,10 @@ void dump_runq(u_char key, void *dev_id, struct pt_regs *regs) printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n", (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now); for (i = 0; i < smp_num_cpus; i++) { - spin_lock_irqsave(&schedule_data[i].lock, flags); + spin_lock_irqsave(&schedule_lock[i], flags); printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt); dump_rqueue(&schedule_data[i].runqueue, "rq"); - spin_unlock_irqrestore(&schedule_data[i].lock, flags); + spin_unlock_irqrestore(&schedule_lock[i], flags); } return; } diff --git a/xen/include/xeno/event.h b/xen/include/xeno/event.h index 6627afdf64..1e9949d376 100644 --- a/xen/include/xeno/event.h +++ b/xen/include/xeno/event.h @@ -28,31 +28,39 @@ */ static inline unsigned long mark_guest_event(struct task_struct *p, int event) { + unsigned long flags, cpu_mask; + if ( test_and_set_bit(event, &p->shared_info->events) ) return 0; - /* - * No need for the runqueue_lock! 
The check below does not race - * with the setting of has_cpu, because that is set with runqueue_lock - * held. The lock must be released before hypervisor exit (and so - * a write barrier executed). And, just before hypervisor exit, - * outstanding events are checked. So bit is certainly set early enough. - */ - smp_mb(); - if ( p->state == TASK_INTERRUPTIBLE ) wake_up(p); - reschedule(p); - return p->has_cpu ? (1 << p->processor) : 0; + spin_lock_irqsave(&schedule_lock[p->processor], flags); + if ( p->state == TASK_INTERRUPTIBLE ) + __wake_up(p); + cpu_mask = __reschedule(p); + if ( p->has_cpu ) + cpu_mask |= 1 << p->processor; + spin_unlock_irqrestore(&schedule_lock[p->processor], flags); + + return cpu_mask; } /* As above, but hyp_events are handled within the hypervisor. */ static inline unsigned long mark_hyp_event(struct task_struct *p, int event) { + unsigned long flags, cpu_mask; + if ( test_and_set_bit(event, &p->hyp_events) ) return 0; - smp_mb(); - if ( p->state == TASK_INTERRUPTIBLE ) wake_up(p); - reschedule(p); - return p->has_cpu ? (1 << p->processor) : 0; + + spin_lock_irqsave(&schedule_lock[p->processor], flags); + if ( p->state == TASK_INTERRUPTIBLE ) + __wake_up(p); + cpu_mask = __reschedule(p); + if ( p->has_cpu ) + cpu_mask |= 1 << p->processor; + spin_unlock_irqrestore(&schedule_lock[p->processor], flags); + + return cpu_mask; } /* Notify the given set of CPUs that guest events may be outstanding. */ diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h index d4caca2cd2..b4a7520a64 100644 --- a/xen/include/xeno/sched.h +++ b/xen/include/xeno/sched.h @@ -233,7 +233,7 @@ extern void free_irq(unsigned int, void *); extern unsigned long wait_init_idle; #define init_idle() clear_bit(smp_processor_id(), &wait_init_idle); - +extern spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned; /* * Scheduler functions (in schedule.c) @@ -247,8 +247,10 @@ long sched_bvtctl(unsigned long ctx_allow); long sched_adjdom(int dom, unsigned long mcu_adv, unsigned long warp, unsigned long warpl, unsigned long warpu); void init_idle_task(void); -int wake_up(struct task_struct *p); +void __wake_up(struct task_struct *p); +void wake_up(struct task_struct *p); long do_yield(void); +unsigned long __reschedule(struct task_struct *p); void reschedule(struct task_struct *p); /* NB. Limited entry in Xen. Not for arbitrary use! */ diff --git a/xenolinux-2.4.22-sparse/arch/xeno/kernel/traps.c b/xenolinux-2.4.22-sparse/arch/xeno/kernel/traps.c index b9159df6df..cb45e1c278 100644 --- a/xenolinux-2.4.22-sparse/arch/xeno/kernel/traps.c +++ b/xenolinux-2.4.22-sparse/arch/xeno/kernel/traps.c @@ -305,6 +305,24 @@ DO_ERROR(18, SIGBUS, "machine check", machine_check) asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) { + /* + * If we trapped on an LDT access then ensure that the default_ldt is + * loaded, if nothing else. We load default_ldt lazily because LDT + * switching costs time and many applications don't need it. 
+ */ + if ( unlikely((error_code & 6) == 4) ) + { + unsigned long ldt; + flush_page_update_queue(); /* ensure LDTR is up to date */ + __asm__ __volatile__ ( "sldt %0" : "=r" (ldt) ); + if ( likely(ldt == 0) ) + { + queue_set_ldt((unsigned long)&default_ldt[0], 5); + flush_page_update_queue(); + return; + } + } + if (!(regs->xcs & 2)) goto gp_in_kernel; diff --git a/xenolinux-2.4.22-sparse/include/asm-xeno/desc.h b/xenolinux-2.4.22-sparse/include/asm-xeno/desc.h index c417cbe807..545b7f8256 100644 --- a/xenolinux-2.4.22-sparse/include/asm-xeno/desc.h +++ b/xenolinux-2.4.22-sparse/include/asm-xeno/desc.h @@ -18,18 +18,20 @@ extern struct desc_struct default_ldt[]; static inline void clear_LDT(void) { - queue_set_ldt((unsigned long)&default_ldt[0], 5); + /* + * NB. We load the default_ldt for lcall7/27 handling on demand, as + * it slows down context switching. Noone uses it anyway. + */ + queue_set_ldt(0, 0); } static inline void load_LDT(struct mm_struct *mm) { void *segments = mm->context.segments; - int count = LDT_ENTRIES; + int count = 0; - if (!segments) { - segments = &default_ldt[0]; - count = 5; - } + if ( unlikely(segments != NULL) ) + count = LDT_ENTRIES; queue_set_ldt((unsigned long)segments, count); } -- 2.30.2